This section will focus on: 1. How to engineer a graph plot: + Igraph’s Plot + ggraph 2. Useful tips on how to effectively use visualization for graph analysis: + Filter + Aggregate + Query
library(tidyverse)
library(ggraph)
library(igraph)
library(igraphdata)
library(magrittr)
Before we try to tackle a big dataset, let’s try to plot a graph that we know has structure.
set.seed(4321)
tree_g <- make_tree(40, 4) %>%
set_edge_attr('weight', value = sample(1:3, ecount(.), replace = T)) %>%
set_vertex_attr('degree', value = degree(.))
set.seed(1234)
plot(tree_g)
set.seed(1234)
plot(tree_g,
vertex.size = 5,
vertex.label = 'node',
vertex.label.cex = .75,
edge.width = 2,
edge.arrow.size = .5)
set.seed(1234)
plot(tree_g,
vertex.size = V(tree_g)$degree * 3,
vertex.label = ifelse(1:vcount(tree_g) %% 2, 'node', 'vertex'),
vertex.label.cex = sample(c(.5, 1), vcount(tree_g), replace = T),
edge.width = E(tree_g)$weight,
edge.arrow.size = .5)
set.seed(1234)
plot(tree_g,
vertex.label = NA,
vertex.size = 10,
vertex.color = 'hotpink',
edge.color = 'midnightblue',
edge.arrow.size = .5)
set.seed(1234)
plot(tree_g,
vertex.label = NA,
vertex.size = 10,
vertex.color = map_chr(V(tree_g)$degree, function(x){switch(as.character(x), "4" = 'red', "5" = 'white', "1" = 'blue')}),
edge.color = map_chr(E(tree_g)$weight, function(x){rgb(0, 0, 1, x/3)}),
edge.arrow.size = .5)
V(tree_g)$color = map_chr(V(tree_g)$degree, function(x){switch(as.character(x), "4" = 'red', "5" = 'white', "1" = 'blue')})
V(tree_g)$size = V(tree_g)$degree * 4
V(tree_g)$label = NA
E(tree_g)$color = map_chr(E(tree_g)$weight, function(x){rgb(0, 0, 1, x/3)})
E(tree_g)$width = E(tree_g)$weight
E(tree_g)$arrow.size = .3
set.seed(1234)
plot(tree_g)
plot(tree_g, layout = layout_as_tree)
plot(tree_g, layout = layout_as_tree(tree_g, circular = T))
If you want to create your own matrix, then the first column of the matrix is the horizontal location (x) and the second column of the matrix is the vertical location (y). The default of igraph.plot is to make the x location dependent on the y location. If you do not want this behaviour, then change the aspect ration asp parameter to 0.
myLayout = cbind(c(vcount(tree_g)/2,
1:4,
map_dbl(1:16, function(x){vcount(tree_g) - x}),
map_dbl(1:19, function(x){vcount(tree_g)/2 - x})),
c(1, rep(vcount(tree_g)/4, 4), rep(vcount(tree_g)/2, 16), rep(vcount(tree_g), 19)))
plot(tree_g, layout = myLayout, asp = 0)
axis(1, at = c(-1, 0, 1), labels = c('0', 'vcount/2', 'vcount'))
axis(2, at = c(-1, 0, 1), labels = c('0', 'vcount/2', 'vcount'))
set.seed(1234)
ggraph(tree_g) +
geom_edge_fan() +
geom_node_point() +
theme_void()
set.seed(1234)
ggraph(tree_g) +
geom_edge_fan(colour = 'blue',
arrow = arrow()) +
geom_node_point(size = 5,
shape = 21,
fill = 'rosybrown') +
theme_void()
## Using `nicely` as default layout
set.seed(1234)
ggraph(tree_g) +
geom_edge_fan(colour = 'blue',
arrow = arrow(length = unit(3, 'mm')),
start_cap = circle(3, 'mm'),
end_cap = circle(3, 'mm')) +
geom_node_point(size = 5,
shape = 21,
fill = 'rosybrown') +
theme_void()
## Using `nicely` as default layout
set.seed(1234)
ggraph(tree_g) +
geom_edge_fan(aes(color = weight,
width = weight),
arrow = arrow(length = unit(3, 'mm')),
start_cap = circle(3, 'mm'),
end_cap = circle(3, 'mm')) +
geom_node_point(aes(size = degree,
fill = as.character(degree)),
shape = 21) +
theme_void()
set.seed(1234)
ggraph(tree_g) +
geom_edge_fan(aes(color = weight,
width = weight),
arrow = arrow(length = unit(3, 'mm')),
start_cap = circle(3, 'mm'),
end_cap = circle(3, 'mm')) +
geom_node_point(aes(size = degree,
fill = as.character(degree)),
shape = 21) +
theme_void() +
scale_edge_color_continuous(low = '#ece7f2', high = '#2b8cbe') +
scale_edge_width(range = c(.5, 1.5)) +
scale_size_area(max_size = 10) +
scale_fill_manual(values = c( "4" = 'red', "5" = 'white', "1" = 'blue'))
set.seed(1234)
ggraph(tree_g, layout = 'tree') +
geom_edge_fan(aes(color = weight,
width = weight),
arrow = arrow(length = unit(3, 'mm')),
start_cap = circle(3, 'mm'),
end_cap = circle(3, 'mm')) +
geom_node_point(aes(size = degree,
fill = as.character(degree)),
shape = 21) +
theme_void() +
scale_edge_color_continuous(low = '#ece7f2', high = '#2b8cbe') +
scale_edge_width(range = c(.5, 1.5)) +
scale_size_area(max_size = 10) +
scale_fill_manual(values = c( "4" = 'red', "5" = 'white', "1" = 'blue'))
set.seed(1234)
ggraph(tree_g, layout = 'tree', circular = T) +
geom_edge_fan(aes(color = weight,
width = weight),
arrow = arrow(length = unit(3, 'mm')),
start_cap = circle(3, 'mm'),
end_cap = circle(3, 'mm')) +
geom_node_point(aes(size = degree,
fill = as.character(degree)),
shape = 21) +
theme_void() +
scale_edge_color_continuous(low = '#ece7f2', high = '#2b8cbe') +
scale_edge_width(range = c(.5, 1.5)) +
scale_size_area(max_size = 10) +
scale_fill_manual(values = c( "4" = 'red', "5" = 'white', "1" = 'blue'))
set.seed(1234)
ggraph(tree_g, layout = 'manual', node.positions = data.frame(x = myLayout[,1], y = myLayout[,2])) +
geom_edge_fan(aes(color = weight,
width = weight),
arrow = arrow(length = unit(3, 'mm')),
start_cap = circle(3, 'mm'),
end_cap = circle(3, 'mm')) +
geom_node_point(aes(size = degree,
fill = as.character(degree)),
shape = 21) +
theme_void() +
scale_edge_color_continuous(low = '#ece7f2', high = '#2b8cbe') +
scale_edge_width(range = c(.5, 1.5)) +
scale_size_area(max_size = 10) +
scale_fill_manual(values = c( "4" = 'red', "5" = 'white', "1" = 'blue'))
data(enron)
enron
## IGRAPH 64ec693 D--- 184 125409 -- Enron email network
## + attr: LDC_names (g/c), LDC_desc (g/c), name (g/c), Citation
## | (g/c), Email (v/c), Name (v/c), Note (v/c), Time (e/c),
## | Reciptype (e/c), Topic (e/n), LDC_topic (e/n)
## + edges from 64ec693:
## [1] 25->154 25->154 30-> 30 30-> 30 30-> 30 30-> 30 39-> 39
## [8] 52-> 67 52-> 67 52-> 67 52-> 67 61->100 61->100 61->163
## [15] 61->163 61->166 61->166 61->170 64-> 59 64-> 59 64-> 64
## [22] 64-> 64 64->147 64->147 64->164 64->164 64->168 66-> 66
## [29] 66-> 66 67->129 67->129 67->129 67->129 93-> 10 93-> 10
## [36] 93-> 10 93-> 10 93-> 39 93-> 39 93-> 93 93-> 93 93-> 93
## + ... omitted several edges
graph_attr(enron) %>%
lapply(head)
## $LDC_names
## [1] "Calif_analysis" "Calif_bankruptcy" "Calif_utilities"
## [4] "Calif_crisis_legal" "Calif_enron" "Calif_federal"
##
## $LDC_desc
## [1] "Executive summaries and analyses about the California situation. (304 entries)"
## [2] "Specifically mentioned financial difficulties of the utilities such as Southern California Edison (SoCal Edison) and Pacific Gas & Electric (PG & E). (36 entries)"
## [3] "General references to California utility companies: Edison, Pacific Gas & Electric, and the California Public Utility Commission (CPUC) which regulates them. (116)"
## [4] "Articles about legal issues surrounding California energy crisis. (109)"
## [5] "Enron business emails about the day to day operations of managing the California side of their business. (699)"
## [6] "Emails about FERC (Federal Energy Regulatory Commission), U.S. Senate Hearings. (61)"
##
## $name
## [1] "Enron email network"
##
## $Citation
## [1] "C.E. Priebe, J.M. Conroy, D.J. Marchette, and Y. Park, \"Scan Statistics on Enron Graphs,\" Computational and Mathematical Organization Theory, Volume 11, Number 3, p229 - 247, October 2005, Springer Science+Business Media B.V."
vertex_attr(enron) %>%
lapply(head)
## $Email
## [1] "albert.meyers" "a..martin" "andrea.ring" "andrew.lewis"
## [5] "andy.zipper" "a..shankman"
##
## $Name
## [1] "Albert Meyers" "Thomas Martin" "Andrea Ring"
## [4] "Andrew Lewis" "Andy Zipper" "Jeffrey Shankman"
##
## $Note
## [1] "Employee, Specialist" "Vice President"
## [3] "NA" "Director"
## [5] "Vice President, Enron Online" "President, Enron Global Mkts"
There’s no name attribute (it has to be lower case), so let’s make one now. Remember, the name attribute has to be unique for each node.
vertex_attr(enron) %>%
lapply(function(x){x %>% unique %>% length})
## $Email
## [1] 184
##
## $Name
## [1] 126
##
## $Note
## [1] 42
So it seems that the Email attribute is the best candidate for the name because the number of unique Emails matches the number of unique nodes (184)
V(enron)$name <- V(enron)$Email
edge_attr(enron) %>%
lapply(head)
## $Time
## [1] "1979-12-31 21:00:00" "1979-12-31 21:00:00" "1979-12-31 21:00:00"
## [4] "1979-12-31 21:00:00" "1979-12-31 21:00:00" "1979-12-31 21:00:00"
##
## $Reciptype
## [1] "to" "to" "cc" "cc" "bcc" "bcc"
##
## $Topic
## [1] 1 1 3 3 3 3
##
## $LDC_topic
## [1] 0 -1 -1 -1 -1 -1
set.seed(1234)
ggraph(enron) +
geom_edge_fan(alpha = .1) +
geom_node_point()
## Using `nicely` as default layout
Let’s take a look at the the years really fast
E(enron)$Time %>%
str_extract('\\d{4}\\-\\d{2}') %>%
unique %>%
sort
## [1] "1979-12" "1998-11" "1998-12" "1999-01" "1999-02" "1999-03" "1999-04"
## [8] "1999-05" "1999-06" "1999-07" "1999-08" "1999-09" "1999-10" "1999-11"
## [15] "1999-12" "2000-01" "2000-02" "2000-03" "2000-04" "2000-05" "2000-06"
## [22] "2000-07" "2000-08" "2000-09" "2000-10" "2000-11" "2000-12" "2001-01"
## [29] "2001-02" "2001-03" "2001-04" "2001-05" "2001-06" "2001-07" "2001-08"
## [36] "2001-09" "2001-10" "2001-11" "2001-12" "2002-01" "2002-02" "2002-03"
## [43] "2002-04" "2002-05" "2002-06"
Did you know that Enron was founded in 1985? I think this might be something looking into. Let’s filter the graph down to only include edges from 1979.
old_enron <- enron %>%
`-`(E(.)[!str_detect(E(.)$Time, '^1979')]) %>%
`-`(V(.)[degree(.) < 1])
old_enron
## IGRAPH c756dab DN-- 43 174 -- Enron email network
## + attr: LDC_names (g/c), LDC_desc (g/c), name (g/c), Citation
## | (g/c), Email (v/c), Name (v/c), Note (v/c), name (v/c), Time
## | (e/c), Reciptype (e/c), Topic (e/n), LDC_topic (e/n)
## + edges from c756dab (vertex names):
## [1] daren.farmer ->sally.beck
## [2] daren.farmer ->sally.beck
## [3] debra.perlingiere->debra.perlingiere
## [4] debra.perlingiere->debra.perlingiere
## [5] debra.perlingiere->debra.perlingiere
## [6] debra.perlingiere->debra.perlingiere
## + ... omitted several edges
set.seed(4321)
ggraph(old_enron) +
geom_edge_fan(alpha = .2) +
geom_node_point() +
theme_void()
## Using `nicely` as default layout
V(old_enron)$Note %>% unique
## [1] "NA"
## [2] "Manager, Logistics Manager"
## [3] "CEO, Enron North America and Enron Enery Services"
## [4] "Vice President"
## [5] "Employee"
## [6] "President"
## [7] "In House Lawyer"
## [8] "Vice President, Government Affairs"
## [9] "Employee, Government Relation Executive"
## [10] "Managing Director"
## [11] "President, Enron Global Mkts"
## [12] "CEO, Enron America"
## [13] "CEO"
## [14] "Director, Pipeline Business"
## [15] "President, Enron Online"
## [16] "Managing Director, Legal Department"
## [17] "Employee, Administrative Asisstant"
## [18] "Manager"
## [19] "Vice President, Enron WholeSale Services"
## [20] "Vice President, Regulatory Affairs"
## [21] "Manager, Chief Risk Management Officer"
## [22] "Employee, Chief Operating Officer"
## [23] "Vice President, Vice President & Chief of Staff"
## [24] "Manager, Risk Management Head"
V(old_enron)$pr <- page_rank(old_enron)$vector
V(old_enron)$isExec <- str_detect(V(old_enron)$Note, 'Chief|President|CEO')
set.seed(4321)
ggraph(old_enron) +
geom_edge_fan(alpha = .2) +
geom_edge_loop(alpha = .2) +
geom_node_point(aes(size = pr,
fill = isExec),
shape = 21) +
theme_void() +
scale_fill_manual(values = c("TRUE" = "#8dd3c7", "FALSE" = "#bebada"))
## Using `nicely` as default layout
You should see a major problem with the graph above. The nodes with the largest PageRank are the ones with loops. This is because the algorithm uses a random walker placed on a random node. If the walker lands starts on a looping node, then it will always end up the same node no matter how many steps the walker takes. We need to get rid of the loops if we want an accurate PageRank value. We’ll need to use simplify to get rid of multiple edges between nodes and to get rid of loops. However, we want to keep the multiple edges
old_enron <- simplify(old_enron, remove.multiple = F) %>%
`-`(V(.)[degree(.) < 1])
Now let’s redo everything:
V(old_enron)$pr <- page_rank(old_enron)$vector
V(old_enron)$pr_label <- mapply(function(x,y){ifelse(x > quantile(V(old_enron)$pr, .9), y, '')}, V(old_enron)$pr, V(old_enron)$Email)
set.seed(4321)
ggraph(old_enron) +
geom_edge_fan(alpha = .2) +
geom_edge_loop(alpha = .2) +
geom_node_point(aes(size = pr,
fill = isExec),
shape = 21) +
geom_node_text(aes(label = pr_label)) +
theme_void() +
scale_fill_manual(values = c("TRUE" = "#8dd3c7", "FALSE" = "#bebada"))
## Using `nicely` as default layout
useSummaryStat <- function(graph, label, graphStat, strAttr = 'Email'){
graph %>%
set_vertex_attr(label, value = graphStat(.)$vector) %>%
{
scores <- vertex_attr(., label)
pullText <- vertex_attr(., strAttr)
set_vertex_attr(.,
paste0(label, '_label'),
value = mapply(function(x,y){ifelse(x > quantile(scores, .9), y, '')}, scores, pullText))
}
}
plotEnron <- function(graph, label){
set.seed(4321)
ggraph(graph) +
geom_edge_fan(alpha = .2) +
geom_edge_loop(alpha = .2) +
geom_node_point(aes_string(size = label,
fill = "isExec"),
shape = 21) +
geom_node_text(aes_string(label = paste0(label, '_label'))) +
theme_void() +
scale_fill_manual(values = c("TRUE" = "#8dd3c7", "FALSE" = "#bebada"))
}
useSummaryStat(old_enron, 'auth', authority_score, 'Note') %>%
plotEnron('auth')
## Using `nicely` as default layout
useSummaryStat(old_enron, 'hub', hub_score, 'Note') %>%
plotEnron('hub')
## Using `nicely` as default layout